library(tidyverse)
library(ggplot2)
#install.packages("dplyr")
library(dplyr)Assginment
Data handlings and high-quality illustration for publication
About input data
In this assignment, I will only use two database: eruptions and volcano.
The volcano database contains properties of each volcano in the world, while the eruptions database presents eruption events of them.
Both database contain latitude and longitue, which can be plot on the map.
Load necessary library and read the data
#read data from csv
eruptions <- read.csv("eruptions.csv") #read eruption.csv
volcano <- read.csv("volcano.csv") #read volcano.csv
str(eruptions)'data.frame': 11178 obs. of 15 variables:
$ volcano_number : int 266030 343100 233020 345020 353010 273070 282050 241040 311060 284096 ...
$ volcano_name : chr "Soputan" "San Miguel" "Fournaise, Piton de la" "Rincon de la Vieja" ...
$ eruption_number : int 22354 22355 22343 22346 22347 22344 22345 22338 22341 22340 ...
$ eruption_category : chr "Confirmed Eruption" "Confirmed Eruption" "Confirmed Eruption" "Confirmed Eruption" ...
$ area_of_activity : chr NA NA NA NA ...
$ vei : int NA NA NA NA NA NA NA 2 NA 1 ...
$ start_year : int 2020 2020 2020 2020 2020 2020 2020 2019 2019 2019 ...
$ start_month : int 3 2 2 1 1 1 1 12 12 12 ...
$ start_day : int 23 22 10 31 12 12 11 9 7 5 ...
$ evidence_method_dating: chr "Historical Observations" "Historical Observations" "Historical Observations" "Historical Observations" ...
$ end_year : int 2020 2020 2020 2020 2020 2020 2020 2019 2020 2020 ...
$ end_month : int 4 2 4 4 1 1 4 12 3 4 ...
$ end_day : int 2 22 6 17 12 22 17 9 15 17 ...
$ latitude : num 1.11 13.43 -21.24 10.83 -0.37 ...
$ longitude : num 124.7 -88.3 55.7 -85.3 -91.5 ...
str(volcano) #check the data. 'data.frame': 958 obs. of 26 variables:
$ volcano_number : int 283001 355096 342080 213004 321040 283170 221170 221110 284160 342100 ...
$ volcano_name : chr "Abu" "Acamarachi" "Acatenango" "Acigol-Nevsehir" ...
$ primary_volcano_type : chr "Shield(s)" "Stratovolcano" "Stratovolcano(es)" "Caldera" ...
$ last_eruption_year : chr "-6850" "Unknown" "1972" "-2080" ...
$ country : chr "Japan" "Chile" "Guatemala" "Turkey" ...
$ region : chr "Japan, Taiwan, Marianas" "South America" "México and Central America" "Mediterranean and Western Asia" ...
$ subregion : chr "Honshu" "Northern Chile, Bolivia and Argentina" "Guatemala" "Turkey" ...
$ latitude : num 34.5 -23.3 14.5 38.5 46.2 ...
$ longitude : num 131.6 -67.6 -90.9 34.6 -121.5 ...
$ elevation : int 641 6023 3976 1683 3742 1728 1733 1250 965 3760 ...
$ tectonic_settings : chr "Subduction zone / Continental crust (>25 km)" "Subduction zone / Continental crust (>25 km)" "Subduction zone / Continental crust (>25 km)" "Intraplate / Continental crust (>25 km)" ...
$ evidence_category : chr "Eruption Dated" "Evidence Credible" "Eruption Observed" "Eruption Dated" ...
$ major_rock_1 : chr "Andesite / Basaltic Andesite" "Dacite" "Andesite / Basaltic Andesite" "Rhyolite" ...
$ major_rock_2 : chr "Basalt / Picro-Basalt" "Andesite / Basaltic Andesite" "Dacite" "Dacite" ...
$ major_rock_3 : chr "Dacite" " " " " "Basalt / Picro-Basalt" ...
$ major_rock_4 : chr " " " " " " "Andesite / Basaltic Andesite" ...
$ major_rock_5 : chr " " " " " " " " ...
$ minor_rock_1 : chr " " " " "Basalt / Picro-Basalt" " " ...
$ minor_rock_2 : chr " " " " " " " " ...
$ minor_rock_3 : chr " " " " " " " " ...
$ minor_rock_4 : chr " " " " " " " " ...
$ minor_rock_5 : chr " " " " " " " " ...
$ population_within_5_km : int 3597 0 4329 127863 0 428 101 51 0 9890 ...
$ population_within_10_km : int 9594 7 60730 127863 70 3936 485 6042 0 114404 ...
$ population_within_30_km : int 117805 294 1042836 218469 4019 717078 18645 8611 0 2530449 ...
$ population_within_100_km: int 4071152 9092 7634778 2253483 393303 5024654 1242922 161009 0 7441660 ...
Manipulate data frame
Since the database is big, it is necessary to filter the part of concern to present on graph. This time, I will consider only eruption events in 21st century (2001 until now).
eruptions_2000s <- eruptions %>%
filter(eruption_category == "Confirmed Eruption") %>% #only consider the confirmed eruptions
filter(start_year > 2000) %>% #cut parts of 21st century
filter(vei !="NA") %>% #only consider the known "vei"
dplyr::select(volcano_number,volcano_name, vei, start_year,start_month, start_day, end_year, end_month, end_day, longitude, latitude)
head(eruptions_2000s) volcano_number volcano_name vei start_year start_month start_day
1 241040 Whakaari/White Island 2 2019 12 9
2 284096 Nishinoshima 1 2019 12 5
3 243070 Lateiki 1 2019 10 13
4 290240 Sarychev Peak 2 2019 5 16
5 263310 Tengger Caldera 2 2019 2 18
6 256010 Tinakula 2 2018 12 8
end_year end_month end_day longitude latitude
1 2019 12 9 177.180 -37.520
2 2020 4 17 140.874 27.247
3 2019 10 22 -174.870 -19.180
4 2019 10 7 153.200 48.092
5 2019 7 28 112.950 -7.942
6 2020 4 11 165.804 -10.386
Make wide table
The eruptions database is purely long table. We can make a wide table that is easier to see.
eruption_wide <- eruptions_2000s %>%
# dplyr::select(volcano_name, eruption_category, start_year) %>%
group_by(volcano_name) %>%
mutate(row = row_number()) %>%
dplyr::select(volcano_name,vei, start_year, row) %>%
pivot_wider(names_from = start_year, values_from = vei) %>%
dplyr::select(-row)
head(eruption_wide)# A tibble: 6 × 20
# Groups: volcano_name [6]
volcano_name `2019` `2018` `2017` `2016` `2015` `2014` `2013` `2012` `2011`
<chr> <int> <int> <int> <int> <int> <int> <int> <int> <int>
1 Whakaari/White… 2 NA NA NA NA NA NA NA NA
2 Nishinoshima 1 NA NA NA NA NA NA NA NA
3 Lateiki 1 NA NA NA NA NA NA NA NA
4 Sarychev Peak 2 NA NA NA NA NA NA NA NA
5 Tengger Caldera 2 NA NA NA NA NA NA NA NA
6 Tinakula NA 2 NA NA NA NA NA NA NA
# ℹ 10 more variables: `2010` <int>, `2009` <int>, `2008` <int>, `2007` <int>,
# `2006` <int>, `2005` <int>, `2004` <int>, `2003` <int>, `2002` <int>,
# `2001` <int>
Visuallization
Create graph of eruption events
In eruption database, there is two noticable numberic column vei and year, month, day of starting and ending moment of each event. I will use those data to present on a graph. First, I will merge [..]year, [..]month, [..]day column into one column start_date and end_date, then calculate the length of time in days between them, then plot. Note: The volcanic explosivity index (VEI) is a scale used to measure the size of explosive volcanic eruptions. It was devised by Christopher G. Newhall of the United States Geological Survey and Stephen Self in 1982. VEI of 0.5 is 0.0001 km3 ejecta, not 0.00001 km3.Scale is logarithmic (powers of 10), see official USGS VEI scale. Source: https://en.wikipedia.org/wiki/Volcanic_explosivity_index# |
By chris - Own work based on: https://volcanoes.usgs.gov/Products/Pglossary/vei.html, Public Domain, https://commons.wikimedia.org/w/index.php?curid=3935253 |
eruptions_2000s <- eruptions_2000s %>%
mutate('start_date'=make_date(year=start_year, month=start_month, day=start_day)) %>%
mutate('end_date'=make_date(year=end_year, month=end_month, day=end_day)) %>%
mutate('daycount' = difftime(end_date, start_date, units = "days")) %>%
mutate('daycount' = daycount+1) #correct the day count (not only time different)
library(RColorBrewer)
plot1 <- eruptions_2000s %>% ggplot(mapping=aes(x=start_year, y=daycount)) +
geom_point(aes(size = vei, colour = vei))+
scale_x_continuous(name = "Year of eruption events") +
scale_y_continuous(name = "Length of eruption (days)")+
ggtitle("Eruption events in 21st century by year") +
theme(plot.title.position = "plot")
plot1
I want to add more information into the graph, so I will join eruption with volcano tables to extract more information about the volcano. Color of points is set to the type of primary volcano. Select “Set2” which is color blind-friendly.
#|warning: false
#|echo: true
eruptions_2000s_2 <- left_join(eruptions_2000s, volcano,
by = "volcano_name", relationship = "many-to-many")
#as.numeric(eruptions_2000s$daycount)
plot2 <- eruptions_2000s_2 %>% ggplot(mapping=aes(x=start_year, y=daycount)) +
geom_point(aes(size = vei, colour = primary_volcano_type))+
scale_colour_brewer(palette = "Set2") +
scale_x_continuous(name = "Year of eruption events") +
scale_y_continuous(name = "Length of eruption (days)")+
ggtitle("Eruption events in 21st century by year") +
theme(plot.title.position = "plot")
plot2Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
Warning: Removed 524 rows containing missing values or values outside the scale range
(`geom_point()`).

The problem appears when there are some similar values of primary_volcano_type due to the typos. This needs to be clean before plotting.
eruptions_2000s_3<- eruptions_2000s_2 %>%
mutate(
primary_volcano_type= str_replace_all(primary_volcano_type,
fixed(c('Stratovolcano?' = 'Stratovolcano',
'Stratovolcano(es)' = 'Stratovolcano',
'Pyroclastic cone(s)'= 'Pyroclastic cone',
'Shield(s)' = 'Shield',
'Lava dome(s)'= 'Lava dome',
'Lava cone(es)' = 'Lava cone',
'Lava cone(s)'= 'Lava cone',
'Tuff cone(s)' = 'Tuff cone',
'Complex(es)' = 'Complex'))))
unique(eruptions_2000s_3$primary_volcano_type) #check the values[1] "Stratovolcano" NA "Complex"
[4] "Shield" "Caldera" "Submarine"
[7] "Pyroclastic shield" "Fissure vent(s)" "Lava dome"
Then re-do the plotting. But since “Set2” palette has only 8 colors while the field primary_volcano_type has 9 value, including NA, I will change the color palette, which is still colorblind-friendly with more value.
plot3 <- eruptions_2000s_3 %>% ggplot(mapping=aes(x=start_year, y=daycount)) +
geom_point(aes(size = vei, colour = primary_volcano_type))+
scale_colour_viridis_d(option = "D") +
scale_x_continuous(name = "Year of eruption events") +
scale_y_continuous(name = "Length of eruption (days)")+
ggtitle("Eruption events in 21st century by year") +
theme(plot.title.position = "plot")
plot3
Present the eruption events in 2000s on interactive map
#Load necessary packages
install.load::install_load( "raster", "sf","maps", "terra", "ggspatial", "mapview")
knitr::opts_chunk$set(warning = TRUE, # show warnings
message = TRUE, # show messages
error = TRUE, # do not interrupt generation in case of errors,
echo = TRUE)
# Load packages
pcks <- list("dplyr",
"sf", # used for handling spatial data
"ggspatial", # spatial for ggplot2
"mapview", # interactive maps
"tidyverse")eruptions_2000s_shp <- st_as_sf(eruptions_2000s_3,
coords = c("longitude.x", "latitude.x"),
crs = 4326) #CRS for world map (WGS84)
pal <- mapviewPalette("mapviewVectorColors") #set colour pallet.
map1 <- eruptions_2000s_shp %>%
mapview(zcol="primary_volcano_type", #colour by volcano type
col.regions = pal(8),
cex = "vei", #size of points by vei
alpha = 0.5,
legend = TRUE)
map1Present location of all volcanoes
Instead of showing map of eruptions in 21st century, we can also show another map with all volcano in the world:
volcano_points <- st_as_sf(volcano,
coords = c("longitude", "latitude"),
crs = 4326) #CRS for world map (WGS84)
volcano_points2<- volcano_points %>%
mutate(primary_volcano_type=
str_replace_all(primary_volcano_type, #clean data, similar to above
fixed(c('Stratovolcano?' = 'Stratovolcano',
'Stratovolcano(es)' = 'Stratovolcano',
'Pyroclastic cone(s)'= 'Pyroclastic cone',
'Shield(s)' = 'Shield',
'Lava dome(s)'= 'Lava dome',
'Lava cone(es)' = 'Lava cone',
'Lava cone(s)'= 'Lava cone',
'Tuff cone(s)' = 'Tuff cone',
'Complex(es)' = 'Complex',
'Caldera(s)' = 'Caldera'))))
unique(volcano_points2$primary_volcano_type) #check the values again [1] "Shield" "Stratovolcano" "Caldera"
[4] "Submarine" "Volcanic field" "Fissure vent(s)"
[7] "Compound" "Complex" "Pyroclastic shield"
[10] "Pyroclastic cone" "Lava dome" "Lava cone"
[13] "Crater rows" "Maar(s)" "Tuff cone"
[16] "Subglacial"
pal2 <- mapviewPalette("mapviewVectorColors") #set colour palette.
map2 <- volcano_points2 %>%
mapview(zcol="primary_volcano_type", #colour by regions
col.regions = pal(14), #there are 14 unique values of types
cex = 3,
stroke = 0.5,
alpha = 0.5,
legend = TRUE
)
map2Reflection
What I learnt:
- With the complete new dataset, it takes time in the beginning to explore and understand the data.
- There are challenges of cleaning data before working on, I learnt from internet sources.
- Therefore, there is a lot of things that I can improve: data cleaning, customizing the presentation, add more necessary elements to maps and graph.


.png)

.png)